########################################################
## PROGRAM NAME: 201_desc_analysis.R                  ##
## AUTHOR: MATT MLECZKO                               ##
## INPUTS:                                            ##
##         100_af_msa.Rda                             ##
##         100_af_muni.Rda                            ##
##         005_msas_keep.Rda                          ##
##         005_cc_out.Rda                             ##
##         101_nzlu_t1_sp_res.Rda                     ##
##         101_nzlu_t2_sp_res.Rda                     ##
##         002_tract_to_cs.Rda                        ##
##         002_tract_to_pl.Rda                        ##
##                                                    ##
## OUTPUTS:                                           ##
##         201_af_muni_fin.Rda                        ##
##         201_af_msa_fin.Rda                         ##
##                                                    ## 
## PURPOSE: Final processing and some                 ##
##          descriptive analyses                      ##
##                                                    ##
## LIST OF UPDATES:                                   ##
########################################################

#log <- file(# USER DEFINED PATH AND FILE NAME HERE #)
#sink(log, append=TRUE)
#sink(log, append=TRUE, type="message")

## load libraries ##

library(tidycensus)
library(tidyverse)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(gsubfn)
library(haven)
library(readxl)
library(car)

## define paths ##
pr_path <- # USER DEFINED PATH #

## set working directory ##
setwd(pr_path)

`%notin%` <- Negate(`%in%`)

## create a merge function that creates merge frequency as in Stata ##
## from user rwbuie at stackoverflow: https://stackoverflow.com/questions/30358401/is-there-a-way-to-create-statas-merge-indicator-variable-with-rs-merge ##
stata.merge <- function(x,y, name){
  x$df1 <- 1
  y$df2 <- 2
  df <- merge(x,y, by = name, all = TRUE)
  df$merge.variable <- rowSums(df[,c("df1", "df2")], na.rm=TRUE)
  df$df1 <- NULL
  df$df2<- NULL
  df
  #print(table(df$merge.variable))
  
  ## return the merged dataframe
  return(df)
}

## load analytic files ##

load("100_af_msa.Rda")
load("100_af_muni.Rda")
load("005_msas_keep.Rda")
load("005_cc_out.Rda")
load("101_nzlu_t1_sp_res.Rda")
load("101_nzlu_t2_sp_res.Rda")
load("002_tract_to_cs.Rda")
load("002_tract_to_pl.Rda")


af.muni.add1.m <- stata.merge(af.muni.metro,
                              nzlu.munis.t1.spst, 
                              "GEOID_full")

table(af.muni.add1.m$merge.variable)

## keep matches ##
af.muni.add1 <- af.muni.add1.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

af.muni.add2.m <- stata.merge(af.muni.add1,
                              nzlu.munis.t2.spst, 
                              "GEOID_full")

table(af.muni.add2.m$merge.variable)

## keep matches ##
af.muni.add2 <- af.muni.add2.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

af.muni.fin <- af.muni.add2 %>%
  mutate(zri_2006_cb = zri_st_2006 + zri_st_2006_median,
         zri_2022_cb = zri_st_2022 + zri_st_2022_median) %>%
  select(GEOID,
         NAME,
         ends_with("_2009"),
         ends_with("_2022"),
         ends_with("_2006"),
         starts_with("rb_"),
         starts_with("mgr_"),
         starts_with("mpv_"),
         zri_st_2006_median,
         zri_2006_cb,
         zri_st_2022_median,
         zri_2022_cb) 

sapply(af.muni.fin, function(x) sum(is.na(x)))
sum(is.na(af.muni.fin$zri_st_2006_median) & !is.na(af.muni.fin$zri_st_2006))
sum(!is.na(af.muni.fin$zri_st_2006_median) & is.na(af.muni.fin$zri_st_2006))


summary(af.muni.fin)
summary(af.muni.fin$zri_2006_cb)
summary(af.muni.fin$zri_2022_cb)
length(unique(af.muni.fin$GEOID)) == nrow(af.muni.fin)

save(af.muni.fin,
    file = "201_af_muni_fin.Rda")

## aggregate ZRI to MSA ##


ptm <- tract.to.pl.metro %>%
  mutate(GEOID_muni = paste0(`FIPS State Code`,
                             placefp)) %>%
  select(GEOID_muni,
         placefp,
         placenm,
         cbsa10,
         `CBSA Title`) %>%
  unique() %>%
  rename(GEOID = GEOID_muni,
         cbsaname10 = `CBSA Title`)

ctm <- tract.to.cs.metro %>%
  mutate(GEOID_muni = paste0(`FIPS State Code`,
                             cousubfp),
         GEOID_muni_full = paste0(`FIPS State Code`,
                                  `FIPS County Code`,
                                  cousubfp)) %>%
  filter(GEOID_muni %notin% ptm$GEOID) %>%
  select(GEOID_muni,
         GEOID_muni_full,
         cousubfp,
         cousubnm,
         cbsa10,
         `CBSA Title`) %>%
  unique() %>%
  rename(GEOID = GEOID_muni,
         GEOID_full = GEOID_muni_full,
         cbsaname10 = `CBSA Title`)

## merge checks ## 
nrow(af.muni.fin) == length(unique(af.muni.fin$GEOID))
class(af.muni.fin$GEOID)
range(nchar(trim(af.muni.fin$GEOID)))

nrow(ptm) == length(unique(ptm$GEOID))
class(ptm$GEOID)
range(nchar(trim(ptm$GEOID)))

## merge data frames ## 
af.muni.msa.m1 <- stata.merge(af.muni.fin,
                              ptm,
                              "GEOID")

## check merge ## 
table(af.muni.msa.m1$merge.variable)

## output non-matches eligible for match with county subs ##
af.muni.msa.nm1 <- af.muni.msa.m1 %>%
  filter(merge.variable ==1) %>%
  select(-placefp,
         -placenm,
         -cbsa10,
         -cbsaname10,
         -merge.variable)

## keep matches ## 
muni.msa.keep1 <- af.muni.msa.m1 %>%
  filter(merge.variable ==3) %>%
  select(-placefp,
         -placenm,
         -merge.variable) %>%
  mutate(GEOID_full = GEOID)

## check for dups ## 
## these are places that span multiple MSAs ##
muni.msa.dupcheck1 <- muni.msa.keep1 %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## now, county subs ## 

## merge checks ##
nrow(ctm) == length(unique(ctm$GEOID))
class(ctm$GEOID)
range(nchar(trim(ctm$GEOID)))

nrow(af.muni.msa.nm1) == length(unique(af.muni.msa.nm1$GEOID))
class(af.muni.msa.nm1$GEOID)
range(nchar(trim(af.muni.msa.nm1$GEOID)))

## merge data frames ## 
muni.msa.cs.merge <- stata.merge(af.muni.msa.nm1,
                                 ctm,
                                 "GEOID")

## check merge ##
table(muni.msa.cs.merge$merge.variable)

## keep matches ## 
muni.msa.keep2 <- muni.msa.cs.merge %>%
  filter(merge.variable ==3) %>%
  select(-cousubfp,
         -cousubnm,
         -merge.variable)

## check for dups ## 
## these are places that span multiple MSAs ##
muni.msa.dupcheck2 <- muni.msa.keep2 %>%
  group_by(GEOID) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## append the two matched dataframes ##

nzlu.wmsas <- rbind(muni.msa.keep1,
                    muni.msa.keep2)

nzlu.wmsas.fin.dupcheck <- nzlu.wmsas %>%
  group_by(GEOID,cbsa10) %>%
  summarize(n=n(), .groups = "drop") %>%
  filter(n>1)

## create MSA level file ## 

nzlu.msa.pt1 <- nzlu.wmsas %>%
  rename(CBSA = cbsa10) %>%
  group_by(CBSA) %>%
  summarize(cbsaname10 = cbsaname10[which(cbsaname10 != "")[1]],
            responses = n(),
            zri_2006_median = median(zri_2006, na.rm=T),
            zri_2006_range = abs(max(zri_2006, na.rm=T) - min(zri_2006,na.rm=T)),
            zri_2022_median = median(zri_2022, na.rm=T),
            zri_2022_range = abs(max(zri_2022, na.rm=T) - min(zri_2022,na.rm=T)), .groups = "drop") %>%
  mutate(zri_2006_median_st = (zri_2006_median - mean(zri_2006_median, na.rm=T))/sd(zri_2006_median,na.rm=T),
         zri_2006_range_st = (zri_2006_range - mean(zri_2006_range, na.rm=T))/sd(zri_2006_range,na.rm=T),
         zri_2022_median_st = (zri_2022_median - mean(zri_2022_median, na.rm=T))/sd(zri_2022_median,na.rm=T),
         zri_2022_range_st = (zri_2022_range - mean(zri_2022_range, na.rm=T))/sd(zri_2022_range,na.rm=T))

summary(nzlu.msa.pt1)

nzlu.msa.pt1look <- nzlu.msa.pt1 %>%
  select(CBSA,
         cbsaname10,
         zri_2006_median,
         zri_2006_median_st,
         zri_2006_range,
         zri_2006_range_st,
         zri_2022_median,
         zri_2022_median_st,
         zri_2022_range,
         zri_2022_range_st)

## now, let's create the second msa level ZRI dimension ##

## merge on central cities ##

length(unique(nzlu.wmsas$GEOID)) == nrow(nzlu.wmsas)
range(nchar(nzlu.wmsas$GEOID))

length(unique(cc.out$GEOID)) == nrow(cc.out)
range(nchar(cc.out$GEOID))


nzlu.msa.pt2.m <- stata.merge(nzlu.wmsas,
                              cc.out,
                              "GEOID")

## check merge ##
table(nzlu.msa.pt2.m$merge.variable, useNA = "ifany")

## keep all obs, create cc indicator ## 

nzlu.msa.pt2.temp <- nzlu.msa.pt2.m %>%
  filter(merge.variable %in% c(1,3)) %>%
  mutate(cc = ifelse(merge.variable == 3, 1, 0)) %>%
  select(-merge.variable) %>%
  rename(CBSA = cbsa10)

nzlu.msa.pt2a <- nzlu.msa.pt2.temp %>%
  filter(cc == 1) %>%
  group_by(CBSA) %>%
  summarize(cbsaname10 = first(cbsaname10),
            cc_zri_2006 = median(zri_2006,na.rm=T),
            cc_zri_2022 = median(zri_2022,na.rm=T), .groups = "drop")

nzlu.msa.pt2b <- nzlu.msa.pt2.temp %>%
  filter(cc == 0) %>%
  group_by(CBSA) %>%
  summarize(sub_zri_2006_max = max(zri_2006,na.rm=T),
            sub_zri_2022_max = max(zri_2022,na.rm=T),
            sub_zri_2006_min = min(zri_2006,na.rm=T),
            sub_zri_2022_min = min(zri_2022,na.rm=T), .groups = "drop")

nzlu.msa.pt2b[nzlu.msa.pt2b == -Inf] <- NA
nzlu.msa.pt2b[nzlu.msa.pt2b == Inf] <- NA

print("ROWS PRIOR TO NZLU PT 2")
nrow(nzlu.msa.pt2a)
nrow(nzlu.msa.pt2b)

nzlu.msa.pt2 <- nzlu.msa.pt2a %>%
  left_join(nzlu.msa.pt2b,"CBSA") %>%
  select(CBSA,
         cc_zri_2006,
         cc_zri_2022,
         sub_zri_2006_min,
         sub_zri_2022_min,
         sub_zri_2006_max,
         sub_zri_2022_max) %>%
  mutate(zri_2006_diff = ifelse(!is.na(sub_zri_2006_max),
                                sub_zri_2006_max - cc_zri_2006,
                                NA),
         zri_2022_diff = ifelse(!is.na(sub_zri_2022_max),
                                sub_zri_2022_max - cc_zri_2022,
                                NA))

print("ROWS - NZLU PT 1")
nrow(nzlu.msa.pt1)
print("ROWS - NZLU PT 2")
nrow(nzlu.msa.pt2)

## merge ## 

nzlu.msa.final <- nzlu.msa.pt1 %>%
  inner_join(nzlu.msa.pt2, "CBSA") %>%
  inner_join(msas.keep, "CBSA") %>%
  mutate(zri_2006_diff_fin = ifelse(is.na(zri_2006_diff),
                                    zri_2006_range,
                                    zri_2006_diff),
         zri_2022_diff_fin = ifelse(is.na(zri_2022_diff),
                                    zri_2022_range,
                                    zri_2022_diff),
         zri_full_2006 = zri_2006_median + zri_2006_diff_fin,
         zri_full_2022 = zri_2022_median + zri_2022_diff_fin) %>%
  mutate(zri_full_2006_st = (zri_full_2006 - mean(zri_full_2006, na.rm=T))/sd(zri_full_2006,na.rm=T),
         zri_full_2022_st = (zri_full_2022 - mean(zri_full_2022, na.rm=T))/sd(zri_full_2022,na.rm=T))

print("NZLU FINAL ROWS")
nrow(nzlu.msa.final)

summary(nzlu.msa.final) 
sum(is.na(nzlu.msa.final$zri_2006_diff) & !is.na(nzlu.msa.final$zri_2006_range))
sum(is.na(nzlu.msa.final$zri_2022_diff) & !is.na(nzlu.msa.final$zri_2022_range))

## check ##

nzlu.msa.finlook <- nzlu.msa.final %>%
  select(CBSA,
         cbsaname10,
         zri_2006_median,
         zri_2006_diff,
         zri_full_2006,
         zri_full_2006_st,
         zri_2022_median,
         zri_2022_diff,
         zri_full_2022,
         zri_full_2022_st) 

## bring on covariates and outcomes ##

af.msa.fin.m <- stata.merge(nzlu.msa.final,
                            af.msa,
                            "CBSA")

## check merge ##
table(af.msa.fin.m$merge.variable)

## keep matches ##
af.msa.fin <- af.msa.fin.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(treat_2006 = ifelse(zri_full_2006_st > mean(zri_full_2006_st), 1,0),
         treat_2022 = ifelse(zri_full_2022_st > mean(zri_full_2022_st), 1,0),
         pop_density_2009_sq = pop_density_2009^2,
         pop_density_2022_sq = pop_density_2022^2)

save(af.msa.fin,
     file = "201_af_msa_fin.Rda")

## END OF PROGRAM ##

#sink()